###Replication code for Civilian Self Defense Militias in Civil War
###9/14/2018

#Clear workspace and set working directory to reference files
rm(list=ls())
setwd("")

##Some packages may not be necessary. Use install.packages() prior to loading if needed.
library(foreign)
library(pscl)
library(spatstat)
library(sp)
library(stargazer)
library(ggplot2)
library(car)
library(timeDate)
library(xtable)
library(arm)
library(plm)
library(MCMCpack)
library(lme4)
library(mice)
library(VIM)
library(lattice)
library(DataCombine)
library(ROCR)


########IMPORTING .CSV FILE################
Paramilitary2<-read.csv("Paramilitary_August_16.csv")

#####Fix coding of population variable (ln includes 0s)
Paramilitary2$pop_log<-NA 
Paramilitary2$pop_log<-log(Paramilitary2$pobl_tot+1)

###Testing and dropping municipalities that aren't named:
Known<-subset(Paramilitary2, (!is.na(Paramilitary2$radm2)) & (!is.na(Paramilitary2$municipio)))
#detach(Known) - if needed to use other datasets
attach(Known)


Known$left_presence<-ifelse(Known$FARC==1 & Known$ELN==1, 1, 0)
table(Known$left_presence)

######QUALITATIVE ANALYSIS -- values of key municipalities ##########

#For Jordan case - low tax value  
pop_attacks[which(radm2=="JORDAN")]
drugs[which(radm2=="JORDAN")]
AUC[which(radm2=="JORDAN")]
tribut_2_log[which(radm2=="JORDAN")]
disbogota[which(radm2=="JORDAN")]
pop_log[which(radm2=="JORDAN")]


#For Bojaya case - median tax value 
radm2[which(tribut_2_log==median(na.omit(tribut_2_log)))]
pop_attacks[which(radm2=="BOJAYA (BELLA VISTA)")]
AUC[which(radm2=="BOJAYA (BELLA VISTA)")]
drugs[which(radm2=="BOJAYA (BELLA VISTA)")]
tribut_2_log[which(radm2=="BOJAYA (BELLA VISTA)")]
disbogota[which(radm2=="BOJAYA (BELLA VISTA)")]
pop_log[which(radm2=="BOJAYA (BELLA VISTA)")]

#For Yumbo - high tax value (industrial center)
radm2[which(AUC==0 & tribut_2_log>=10)]
pop_attacks[which(tribut_2_log=="YUMBO")]
AUC[which(radm2=="YUMBO")]
drugs[which(radm2=="YUMBO")]
tribut_2_log[which(radm2=="YUMBO")]
disbogota[which(radm2=="YUMBO")]
pop_log[which(radm2=="YUMBO")]

#####GENERATING DATA FOR SPATIAL ANALYSIS -- see .dta for producing tests and maps 
######### MICE for complete dataset for spatial analysis #######

#subset to variables used in primary model for easier analysis 
sp<-subset(Known, select=c(AUC, id, tribut_2_log, tribut_2_log_sq, pop_log, disbogota, drugs, ano, caqueta, meta, pop_attacks, putumayo, guaviare))

md.pattern(sp)

imp <- mice(sp, m=5, printFlag=FALSE, maxit = 40, seed=2525)

imp1<-complete(imp, 1)
imp2<-complete(imp, 2)
imp3<-complete(imp, 3)
imp4<-complete(imp, 4)
imp5<-complete(imp, 5)


#export these datasets for analysis in Stata

write.dta(imp1, "Imputed1_51318.dta")
write.dta(imp2, "Imputed2_51318.dta")
write.dta(imp3, "Imputed3_51318.dta")
write.dta(imp4, "Imputed4_51318.dta")
write.dta(imp5, "Imputed5_51318.dta")


##############ANALYSIS AND PLOTS, MAIN TEXT################

##### PRIMARY MODEL:

main.model<-with(Known, glm(AUC~tribut_2_log+tribut_2_log_sq+pop_attacks+disbogota+ropiumid.2+rcocaid+rcannabisid+pop_log+caqueta+meta+putumayo+guaviare+as.factor(ano), family=binomial(link=probit)))
summary(main.model)
xtable(main.model)
stargazer(main.model)

###### Code for Figure 2 (Predicted Probability of AUC presence):

fixed_97<- ifelse(ano==1997,1,0)
fixed_98<- ifelse(ano==1998,1,0)
fixed_99<- ifelse(ano==1999,1,0)
fixed_00<- ifelse(ano==2000,1,0)
fixed_01<- ifelse(ano==2001,1,0)
fixed_02<- ifelse(ano==2002,1,0)
fixed_03<- ifelse(ano==2003,1,0)
fixed_04<- ifelse(ano==2004,1,0)
fixed_05<- ifelse(ano==2005,1,0)
fixed_06<- ifelse(ano==2006,1,0)
fixed_07<- ifelse(ano==2007,1,0)
fixed_08<- ifelse(ano==2008,1,0)

ingres_min<-min(tribut_2_log, na.rm=T)
ingres_max<-max(tribut_2_log, na.rm=T)
ingres_vec<-seq(from=ingres_min,to=ingres_max, length=100)
trans_vec<-(ingres_vec^2)

logit_const <- na.omit(data.frame(intercept=1,tribut_2_log,tribut_2_log_sq,pop_attacks,
                                    disbogota,ropiumid.2, rcocaid, rcannabisid, pop_log,caqueta, meta, putumayo, guaviare, fixed_97, fixed_98,
                                    fixed_99, fixed_00, fixed_01, fixed_02, fixed_03, fixed_04, fixed_05,
                                    fixed_06, fixed_07, fixed_08)) 


#Make a hypothetical variable. Distance at 3rd quartile, coca on, other drugs off (0), population at mean, pop_attacks at mean, year 2003, FARC-controlled depts set to 0 
hyp_main<- matrix(apply(logit_const,2,mean),nrow=1000,ncol=25,byrow=T)
hyp_main[,2]<-ingres_vec
hyp_main[,3] <- trans_vec
hyp_main[,5] <- (quantile(disbogota, probs=.75))
hyp_main[,6] <- 0
hyp_main[,7] <- 1
hyp_main[,8] <- 0
hyp_main[,10] <- 0
hyp_main[,11] <- 0
hyp_main[,12] <- 0
hyp_main[,13] <- 0
hyp_main[,14] <- 0
hyp_main[,15] <- 0
hyp_main[,16] <- 0
hyp_main[,17] <- 0
hyp_main[,18] <- 0
hyp_main[,19] <- 0
hyp_main[,20] <- 1
hyp_main[,21] <- 0
hyp_main[,22] <- 0
hyp_main[,23] <- 0
hyp_main[,24] <- 0
hyp_main[,25] <- 0


colnames(main.model) <- c("Intercept", "Tax Revenue", "Tax Revenue Squared", "Attacks on Population", "Distance", "Opium", 
                         "Coca", "Cannabis", "Ln(Population)",
                         "Caqueta", "Meta", "Putumayo","Guaviare", 
                         "fixed_97", "fixed_98",
                         "fixed_99", "fixed_00", "fixed_01", "fixed_02", "fixed_03", "fixed_04", "fixed_05",
                         "fixed_06", "fixed_07", "fixed_08")

#Making an empty matrix of values to fill with simulated predicted probabilities
set.seed(18765)
N_sim<-1000
#Extracting betas and constructing vcv
Betas_main<-main.model$coef
vcv_main<- vcov(main.model)
#Using the mvrnorm function with the Betas, vcv, and # of simulations, to simulate
#parameter estimates. Will have 1,000 estimates of each Beta.
Sims_main<- mvrnorm(N_sim,Betas_main, vcv_main)
pprobs_rev <- matrix(NA,N_sim,1000)


for(s in 1:N_sim)
{
  ## Iteration
  print(paste('Iteration', s, 'of', nrow(pprobs_rev)))
  
  pprobs_rev[s,] <- invlogit(as.matrix(hyp_main)%*%Sims_main[s, ]) 
}

## CIs
pe_rev <- apply(pprobs_rev, 2, mean)
lo_rev <- apply(pprobs_rev, 2, quantile, prob = .025)
hi_rev<- apply(pprobs_rev, 2, quantile, prob = .975)

sim.output_rev <- cbind(hyp_main[,2],pe_rev,lo_rev,hi_rev)

base_plot <- ggplot() + labs(x="Revenue", y="Probability of Militia Presence") +
  geom_line(aes(hyp_main[,2],sim.output_rev[,2],color="Predicted Probability Over Values of Municipal Revenue"), size=1) +
  geom_ribbon(aes(x=sim.output_rev[,1],ymin=sim.output_rev[,3],ymax=sim.output_rev_5[,4],fill="95% Confidence Interval (Simulation)"), alpha=.5)+
  theme_bw() +
  theme(legend.title=element_text(), legend.position="bottom", panel.grid.minor.x=element_blank(),  panel.grid.minor.y=element_blank()) +
  scale_fill_manual(values="gray62",name="")+
  scale_color_manual(values="black",name="")

pdf("/Users/")
base_plot
dev.off()

#########Code for Figure 3 (ROC curve):
#####Limiting AUC values to those that are contained in the model:

AUC_2<-AUC[which(tribut_2_log!='NA' & pop_attacks!='NA'& disbogota!='NA' & drugs!='NA' & pop_log!='NA' &
                   caqueta!='NA')]
AUC_3<-na.omit(AUC_2)

predicted <- predict(main.model) #get fitted vals the standard way
prob <- prediction(predicted, as.numeric(AUC_3)) #this package requires a specific type of object (prediction class) 
tprfpr <- performance(prob, "tpr", "fpr") #separate into true positive/false positive
tpr <- unlist(slot(tprfpr, "y.values")) #make object compatible for plotting
fpr <- unlist(slot(tprfpr, "x.values"))
roc <- data.frame(tpr, fpr) #bind in data frame for ggplot

#For total area under curve:
performance(prob, measure = "auc")

#below is pretty straightforward. If you want the area filled in, uncomment the ribbon code
ROC_plot<-ggplot(roc) + geom_line(aes(x = fpr, y = tpr))+ 
  geom_abline(intercept = 0, slope = 1, colour = "gray")+
  #geom_ribbon(alpha=0.2, x=fpr, ymin=0, ymax=tpr, fill="blue")
  ylab("True Positive Rate: Sensitivity") + 
  xlab("False Positive Rate:1 - Specificity")+ theme_bw()

pdf("/Users/")
ROC_plot
dev.off()


#############DESCRIPTIVE STATISTICS, ANALYSIS, AND PLOTS: APPENDIX######### 

#Table 4 in appendix is the same as above, but with year fixed effects reported.

#Table 5: 
#Model with drug type not disaggregated 
appendix_1<-glm(AUC~tribut_2_log+tribut_2_log_sq+pop_attacks+disbogota+drugs+pop_log+
                caqueta+meta+putumayo+guaviare+as.factor(ano), family=binomial(link=probit), data=Known) 
summary(appendix_1)
stargazer(appendix_1)

#Table 6: 
#No violencia, with FARC departments, no year F.E.s 
appendix_2<-glm(AUC~tribut_2_log+tribut_2_log_sq+pop_attacks+disbogota+drugs+pop_log+
                caqueta+meta+putumayo+guaviare, family=binomial(link=probit)) 

summary(appendix_2)
xtable(appendix_2)

#Table 7: 
#No FARC depts, with violencia and year F.E.s
appendix_3<-glm(AUC~tribut_2_log+tribut_2_log_sq+pop_attacks+disbogota+Violencia_48_a_53+drugs+
                pop_log+as.factor(ano), family=binomial(link=probit)) 
summary(appendix_3)
xtable(appendix_3)

#Table 8: 
#Distance from Department capital (instead of Bogota)
appendix_4<-glm(AUC~tribut_2_log+tribut_2_log_sq+pop_attacks+discapital+drugs+pop_log+
                caqueta+meta+putumayo+guaviare+as.factor(ano), family=binomial(link=probit))
summary(appendix_4)
xtable(appendix_4)

#Table 9: 
#Poisson for AUC counts 
appendix_5<-glm(tpobc_AUC~tribut_2_log+tribut_2_log_sq+pop_attacks+discapital+drugs+pop_log+
                  caqueta+meta+putumayo+guaviare+as.factor(ano), family=poisson)
summary(appendix_5)
xtable(appendix_5)

#Table 10:
#Limit data to 2005 
Known_2005<-Known[which(ano<2005),]
appendix_6<-glm(Known_2005$AUC~Known_2005$tribut_2_log+Known_2005$tribut_2_log_sq+Known_2005$pop_attacks+Known_2005$discapital+Known_2005$drugs+Known_2005$pop_log+
                  Known_2005$caqueta+Known_2005$meta+Known_2005$putumayo+Known_2005$guaviare+as.factor(Known_2005$ano), family=binomial(link=probit))
summary(appendix_6)
xtable(appendix_6)

#Table 11: 
#With lagged IV (taxes)
#Generate lag for tribut_2_log
Known_2<- slide(Known, Var = "tribut_2_log", GroupVar = "id",
                   slideBy = -1)
#check
head(Known_2$`tribut_2_log-1`)
head(Known_2$tribut_2_log)

#Generate lag for tribut_2_log_sq
Known_3<- slide(Known_2, Var="tribut_2_log_sq", GroupVar="id", slideBy= -1)
#check
head(Known_3$`tribut_2_log_sq-1`)
head(Known_3$tribut_2_log_sq)

appendix_7<-with(Known_3, glm(AUC~`tribut_2_log-1`+`tribut_2_log_sq-1`+pop_attacks+disbogota+drugs+pop_log+caqueta+meta+putumayo+guaviare+as.factor(ano), family=binomial(link=probit)))
summary(appendix_7)
xtable(appendix_7)

#Table 12: 
#With coca presence interacted with tax value 
appendix_8<-glm(AUC~tribut_2_log+tribut_2_log_sq+pop_attacks+disbogota+ropiumid.2+rcocaid+rcannabisid+rcocaid*tribut_2_log+rcocaid*tribut_2_log_sq+pop_log+caqueta+meta+putumayo+guaviare+as.factor(ano), family=binomial(link=probit))
summary(appendix_8)
xtable(appendix_8)

####Figures and Descriptive Statistics:
plot_data<-na.omit(as.data.frame(cbind(tribut_2_log, tribut_2, tribut_2_log_sq, pop_attacks, disbogota, ropiumid.2, rcocaid, rcannabisid, pop_log, caqueta, meta, putumayo, guaviare, Violencia_48_a_53)))

#Figure 2 - Distribution of Ln(Tax Revenue)
pdf("/Histogram_logged_full.pdf") 
hist(plot_data$tribut_2_log, main="Distribution of Logged Income Variable", xlab="Ln(Municipal Income Tax)", col="gray62")
dev.off()

#Figure 3 - Distribution of Drug Presence
drugs_data<-cbind(plot_data$ropiumid.2, plot_data$rcocaid, plot_data$rcannabisid)
colnames(drugs_data)<-c("Opium", "Coca", "Cannabis")

pdf("/Histogram_drug_type.pdf")
barplot(apply(drugs_data, 2, table), main="Distribution of Drug Types", xlab="", col="gray62", beside=T)
dev.off()

#Figure 4 - Distribution of La Violencia 
pdf("/Histogram_Violencia_full.pdf")
barplot(table(plot_data$Violencia_48_a_53), main="Distribution of Violencia Municipalities", xlab="", col="gray62")
dev.off()

#Table 10 - Descriptive Statistics
descriptive<-rbind(summary(plot_data$pop_attacks),summary(plot_data$disbogota), summary(plot_data$pop_log))
rownames(descriptive)<-rbind("Leftist Attacks on Civilians", "Distance from Bogota", "Ln(Population)")
xtable(descriptive)





